GNNpaper - [FRAUD] df02: accuracy는 0.9707이지만 F1은 매우 낮음 (macro F1 0.4926)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# gnn
import torch
import torch_geometric
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

- fraudTrain

# Load the raw transactions; the CSV's first column is dropped by position.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]

# Parse the timestamp column with a single vectorized call instead of one
# pd.to_datetime call per row (the frame has about one million rows).
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain

	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	street	city	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long	is_fraud
0	2019-01-01 00:00:00	2.703190e+15	fraud_Rippin, Kub and Mann	misc_net	4.97	Jennifer	Banks	F	561 Perry Cove	Moravian Falls	...	36.0788	-81.1781	3495	Psychologist, counselling	1988-03-09	0b242abb623afc578575680df30655b9	1325376018	36.011293	-82.048315	0
1	2019-01-01 00:00:00	6.304230e+11	fraud_Heller, Gutmann and Zieme	grocery_pos	107.23	Stephanie	Gill	F	43039 Riley Greens Suite 393	Orient	...	48.8878	-118.2105	149	Special educational needs teacher	1978-06-21	1f76529f8574734946361c461b024d99	1325376044	49.159047	-118.186462	0
2	2019-01-01 00:00:00	3.885950e+13	fraud_Lind-Buckridge	entertainment	220.11	Edward	Sanchez	M	594 White Dale Suite 530	Malad City	...	42.1808	-112.2620	4154	Nature conservation officer	1962-01-19	a1a22d70485983eac12b5b88dad1cf95	1325376051	43.150704	-112.154481	0
3	2019-01-01 00:01:00	3.534090e+15	fraud_Kutch, Hermiston and Farrell	gas_transport	45.00	Jeremy	White	M	9443 Cynthia Court Apt. 038	Boulder	...	46.2306	-112.1138	1939	Patent attorney	1967-01-12	6b849c168bdad6f867558c3793159a81	1325376076	47.034331	-112.561071	0
4	2019-01-01 00:03:00	3.755340e+14	fraud_Keeling-Crist	misc_pos	41.96	Tyler	Garcia	M	408 Bradley Rest	Doe Hill	...	38.4207	-79.4629	99	Dance movement psychotherapist	1986-03-28	a41d7549acf90789359a9aa5346dcb46	1325376186	38.674999	-78.632459	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1048570	2020-03-10 16:07:00	6.011980e+15	fraud_Fadel Inc	health_fitness	77.00	Haley	Wagner	F	05561 Farrell Crescent	Annapolis	...	39.0305	-76.5515	92106	Accountant, chartered certified	1943-05-28	45ecd198c65e81e597db22e8d2ef7361	1362931649	38.779464	-76.317042	0
1048571	2020-03-10 16:07:00	4.839040e+15	fraud_Cremin, Hamill and Reichel	misc_pos	116.94	Meredith	Campbell	F	043 Hanson Turnpike	Hedrick	...	41.1826	-92.3097	1583	Geochemist	1999-06-28	c00ce51c6ebb7657474a77b9e0b51f34	1362931670	41.400318	-92.726724	0
1048572	2020-03-10 16:08:00	5.718440e+11	fraud_O'Connell, Botsford and Hand	home	21.27	Susan	Mills	F	005 Cody Estates	Louisville	...	38.2507	-85.7476	736284	Engineering geologist	1952-04-02	17c9dc8b2a6449ca2473726346e58e6c	1362931711	37.293339	-84.798122	0
1048573	2020-03-10 16:08:00	4.646850e+18	fraud_Thompson-Gleason	health_fitness	9.52	Julia	Bell	F	576 House Crossroad	West Sayville	...	40.7320	-73.1000	4056	Film/video editor	1990-06-25	5ca650881b48a6a38754f841c23b77ab	1362931718	39.773077	-72.213209	0
1048574	2020-03-10 16:08:00	2.283740e+15	fraud_Buckridge PLC	misc_pos	6.81	Shannon	Williams	F	9345 Spencer Junctions Suite 183	Alpharetta	...	34.0770	-84.3033	165556	Prison officer	1997-12-27	8d0a575fe635bbde12f1a2bffc126731	1362931730	33.601468	-83.891921	0

1048575 rows × 22 columns

- df02

# Undersample the majority class: keep a reproducible 20% of the rows with
# is_fraud == 0 and every row with is_fraud == 1.
_df1 = fraudTrain.loc[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain.loc[fraudTrain["is_fraud"] == 1]
# reset_index() moves the original row labels into an "index" column and
# renumbers the rows from 0.
df02 = pd.concat([_df1, _df2]).reset_index()

- df_toy

# Five-row toy frame with hand-assigned card numbers (three rows for
# customer 1, two rows for customer 2) used to check the pairwise logic.
df_toy = df02.iloc[:5].copy()
df_toy["cc_num"] = pd.Series([1, 1, 1, 2, 2])
df_toy

	index	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	street	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long
0	669418	2019-10-12 18:21:00	1	fraud_Haley, Jewess and Bechtelar	shopping_pos	7.53	Debra	Stark	F	686 Linda Rest	...	32.3836	-94.8653	24536	Multimedia programmer	1983-10-14	d313353fa30233e5fab5468e852d22fc	1350066071	32.202008	-94.371865
1	32567	2019-01-20 13:06:00	1	fraud_Turner LLC	travel	3.79	Judith	Moss	F	46297 Benjamin Plains Suite 703	...	39.5370	-83.4550	22305	Television floor manager	1939-03-09	88c65b4e1585934d578511e627fe3589	1327064760	39.156673	-82.930503
2	156587	2019-03-24 18:09:00	1	fraud_Klein Group	entertainment	59.07	Debbie	Payne	F	204 Ashley Neck Apt. 169	...	41.5224	-71.9934	4720	Broadcast presenter	1977-05-18	3bd9ede04b5c093143d5e5292940b670	1332612553	41.657152	-72.595751
3	1020243	2020-02-25 15:12:00	2	fraud_Monahan-Morar	personal_care	25.58	Alan	Parsons	M	0547 Russell Ford Suite 574	...	39.6171	-102.4776	207	Network engineer	1955-12-04	19e16ee7a01d229e750359098365e321	1361805120	39.080346	-103.213452
4	116272	2019-03-06 23:19:00	2	fraud_Kozey-Kuhlman	personal_care	84.96	Jill	Flores	F	639 Cruz Islands	...	41.9488	-86.4913	3104	Horticulturist, commercial	1981-03-29	a0c8641ca1f5d6e243ed5a2246e66176	1331075954	42.502065	-86.732664

5 rows × 23 columns

- df_toy 에서 time_difference 구함

고객1

# Customer 1: difference of the integer timestamp values of toy rows 0 and 1.
df_toy["trans_date_trans_time"].iloc[0].value - df_toy["trans_date_trans_time"].iloc[1].value

22914900000000000

# Customer 1: difference of the integer timestamp values of toy rows 0 and 2.
df_toy["trans_date_trans_time"].iloc[0].value - df_toy["trans_date_trans_time"].iloc[2].value

17453520000000000

# Customer 1: difference of the integer timestamp values of toy rows 1 and 2
# (negative here, since no absolute value is taken).
df_toy["trans_date_trans_time"].iloc[1].value - df_toy["trans_date_trans_time"].iloc[2].value

-5461380000000000

고객2

# Customer 2: difference of the integer timestamp values of toy rows 3 and 4.
df_toy["trans_date_trans_time"].iloc[3].value - df_toy["trans_date_trans_time"].iloc[4].value

30729180000000000

고객1,2

def compute_time_difference(group):
    """Return every ordered pair of rows in *group* with its time gap.

    Args:
        group: DataFrame with a ``trans_date_trans_time`` column whose
            elements expose ``.value`` (as ``pandas.Timestamp`` does).

    Returns:
        A list of ``[label_i, label_j, gap]`` lists, one per ordered pair
        ``(i, j)`` of rows, self-pairs included, in row-major order.
        ``label_i`` / ``label_j`` are the rows' index labels and ``gap`` is
        the absolute difference of the two ``.value`` integers. An empty
        group yields an empty list.
    """
    # Read labels and timestamp values once up front. Indexing with
    # ``group.iloc[...]`` inside the double loop rebuilt a row Series four
    # times per pair, which dominates the run time on large groups.
    labels = list(group.index)
    stamps = [ts.value for ts in group.trans_date_trans_time]
    return [
        [label_i, label_j, abs(stamp_i - stamp_j)]
        for label_i, stamp_i in zip(labels, stamps)
        for label_j, stamp_j in zip(labels, stamps)
    ]

# Build the pairwise table for each toy customer and stack all of them into
# one array whose rows are [row label i, row label j, |time gap|].
groups = df_toy.groupby('cc_num')
edge_index_list_plus = [compute_time_difference(frame) for _key, frame in groups]
edge_index_list_plus_flat = [
    triple for per_customer in edge_index_list_plus for triple in per_customer
]
edge_index_list_plus_nparr = np.asarray(edge_index_list_plus_flat)
edge_index_list_plus_nparr

array([[                0,                 0,                 0],
       [                0,                 1, 22914900000000000],
       [                0,                 2, 17453520000000000],
       [                1,                 0, 22914900000000000],
       [                1,                 1,                 0],
       [                1,                 2,  5461380000000000],
       [                2,                 0, 17453520000000000],
       [                2,                 1,  5461380000000000],
       [                2,                 2,                 0],
       [                3,                 3,                 0],
       [                3,                 4, 30729180000000000],
       [                4,                 3, 30729180000000000],
       [                4,                 4,                 0]])

- df02에서 time_difference 구함

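# One-off computation of the df02 pairwise table, saved to
# 'edge_index_list_plus02.npy'. It is left commented out (presumably because
# it is slow); a later cell loads the saved file instead.
# NOTE: re-enabling it requires `import time`, which is not among the
# imports at the top of this notebook.
# t1 = time.time()
# groups = df02.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus02.npy', edge_index_list_plus_nparr)
# t2 = time.time()
# t2-t1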

# Group df02 by card number.
# NOTE(review): `groups` is not used again in the cells visible below.
groups = df02.groupby("cc_num")

# Overwrite column 2 with exp(-x/theta), zeroing entries where that value is
# exactly 1 (e.g. where x is 0, as for self-pairs).
# NOTE(review): `theta` and `edge_index_list_plus02` are only assigned in
# later cells, so this cell depends on notebook execution order. The output
# recorded below is an all-zero integer array, which suggests the array was
# integer-typed at this point and the fractional weights were truncated on
# assignment — confirm.
edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1)*(np.exp(-edge_index_list_plus02[:,2]/theta))

edge_index_list_plus02

array([[  2881,   2881,      0],
       [  2881,   3061,      0],
       [  2881,   4867,      0],
       ...,
       [212771, 212765,      0],
       [212771, 212769,      0],
       [212771, 212771,      0]])

# Load the saved [row i, row j, |time gap|] table first, as float64 so the
# fractional weights written into column 2 below are not truncated.
edge_index_list_plus02 = np.load('edge_index_list_plus02.npy').astype(np.float64)

# Decay scale: the mean of the raw time gaps. It must be computed after the
# load and before column 2 is overwritten (the cell order previously computed
# it before the array was loaded).
theta = edge_index_list_plus02[:,2].mean()

# Edge weight exp(-gap/theta); weights equal to exactly 1 (e.g. zero gap, as
# for self-pairs) are forced to 0. The exponential is evaluated only once.
edge_index_list_plus02[:,2] = np.exp(-edge_index_list_plus02[:,2]/theta)
edge_index_list_plus02[edge_index_list_plus02[:,2] == 1, 2] = 0

edge_index_list_plus02

array([[2.88100000e+03, 2.88100000e+03, 0.00000000e+00],
       [2.88100000e+03, 3.06100000e+03, 1.96061280e-01],
       [2.88100000e+03, 4.86700000e+03, 8.12918172e-01],
       ...,
       [2.12771000e+05, 2.12765000e+05, 9.97708695e-01],
       [2.12771000e+05, 2.12769000e+05, 9.99923197e-01],
       [2.12771000e+05, 2.12771000e+05, 0.00000000e+00]])

같은 cc_num(카드 번호)끼리 묶어서 거래 간 시간 차이를 계산했다.

# The decay weighting as a standalone vector: exp(-x/theta) over column 2,
# with entries equal to exactly 1 replaced by 0.
weight = np.exp(-edge_index_list_plus02[:, 2] / theta)
weight = np.where(weight != 1, weight, 0.0)

weight

array([0.        , 0.19606128, 0.81291817, ..., 0.9977087 , 0.9999232 ,
       0.        ])

아래 셀처럼 세 번째 열에 바로 대입하면 값이 전부 0으로 나와서(정수형 배열에 대입되면서 소수점 이하가 버려진 것으로 보임), 가중치를 새 열로 붙인 뒤 기존 세 번째 열을 삭제하는 우회 방법을 썼다.

# In-place variant of the weighting: overwrite column 2 with exp(-x/theta),
# zeroing entries where that value is exactly 1.
# NOTE(review): the result depends on what column 2 holds when this runs. If
# it already contains weights rather than raw time gaps, the transform is
# applied a second time — confirm the intended cell order.
edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1)*(np.exp(-edge_index_list_plus02[:,2]/theta))

# Workaround: append `weight` as a new last column instead of assigning it
# into column 2.
edge_index_list_plus02 = np.concatenate(
    (edge_index_list_plus02, weight[:, np.newaxis]), axis=1
)

edge_index_list_plus02

array([[2.88100000e+03, 2.88100000e+03, 0.00000000e+00, 0.00000000e+00],
       [2.88100000e+03, 3.06100000e+03, 1.90922400e+16, 1.96061280e-01],
       [2.88100000e+03, 4.86700000e+03, 2.42706000e+15, 8.12918172e-01],
       ...,
       [2.12771000e+05, 2.12765000e+05, 2.68800000e+13, 9.97708695e-01],
       [2.12771000e+05, 2.12769000e+05, 9.00000000e+11, 9.99923197e-01],
       [2.12771000e+05, 2.12771000e+05, 0.00000000e+00, 0.00000000e+00]])

# Remove the column at position 2 so that the appended weight column takes
# its place as the third column.
edge_index_list_plus02 = np.delete(edge_index_list_plus02, obj=2, axis=1)

edge_index_list_plus02

array([[2.88100000e+03, 2.88100000e+03, 0.00000000e+00],
       [2.88100000e+03, 3.06100000e+03, 1.96061280e-01],
       [2.88100000e+03, 4.86700000e+03, 8.12918172e-01],
       ...,
       [2.12771000e+05, 2.12765000e+05, 9.97708695e-01],
       [2.12771000e+05, 2.12769000e+05, 9.99923197e-01],
       [2.12771000e+05, 2.12771000e+05, 0.00000000e+00]])

# (number of candidate edges, 3)
np.shape(edge_index_list_plus02)

(65831594, 3)

# Plain-Python copy of the [row i, row j, weight] table; kept because later
# cells refer to `edge_index_list_updated`.
edge_index_list_updated = edge_index_list_plus02.tolist()

# Mean edge weight, read from the array directly. Rebuilding an array from
# the ~65M-row list with np.array(...) just to average one column is
# redundant and very memory-hungry.
edge_index_list_plus02[:,2].mean()

0.4536043999922591

# Threshold: the mean edge weight, taken from the array the list copy was
# made from (same values, without a list -> array round trip).
mm = edge_index_list_plus02[:, 2].mean()

# Keep only edges whose weight exceeds the mean. The boolean mask replaces a
# Python-level loop over ~65M rows. `selected_edges` is an (E, 2) int64 array
# of [row i, row j] pairs; the float-stored ids are truncated to integers, as
# int() did in the loop version.
selected_edges = edge_index_list_plus02[edge_index_list_plus02[:, 2] > mm, :2].astype(np.int64)
# Transpose to the (2, E) layout used for `edge_index`; this also yields a
# well-formed (2, 0) tensor when no edge passes the threshold.
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape

torch.Size([2, 29970380])

tr/test

# Seeded random split of the df02 rows with the default split ratio
# (the recorded shapes below are 160890 / 53630 rows).
df02_tr, df02_test = model_selection.train_test_split(df02, random_state=42)

(df02_tr.shape, df02_test.shape)

((160890, 23), (53630, 23))

N = len(df02)

# Position i is a training (test) node when i appears among the index labels
# of the training (test) split. np.isin performs the membership test for all
# N positions in one vectorized call instead of a Python-level loop, and
# returns a boolean NumPy array directly.
train_mask = np.isin(np.arange(N), df02_tr.index)
test_mask = np.isin(np.arange(N), df02_test.index)

train_mask.shape, test_mask.shape

((214520,), (214520,))

data

# Node features: the transaction amount as a single float column, (N, 1).
x = torch.tensor(df02['amt'], dtype=torch.float).unsqueeze(1)
# Node labels: the fraud flag as int64 class ids.
y = torch.tensor(df02['is_fraud'], dtype=torch.long)
# Bundle features, selected edges, labels and the two split masks.
data = torch_geometric.data.Data(
    x=x,
    edge_index=edge_index_selected,
    y=y,
    train_mask=train_mask,
    test_mask=test_mask,
)

data

Data(x=[214520, 1], edge_index=[2, 29970380], y=[214520], train_mask=[214520], test_mask=[214520])


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing per-node log-probabilities.

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Width of the hidden layer.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after the hidden layer while
            the module is in training mode.

    The defaults (1, 16, 2, 0.5) reproduce the previously hard-coded
    architecture, so ``GCN()`` builds the same model as before.
    """

    def __init__(self, in_channels=1, hidden_channels=16, num_classes=2, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return log-softmax class scores, one row per node.

        ``data`` must provide ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index

        x = F.relu(self.conv1(x, edge_index))
        # Dropout is active only while self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Full-batch training: 400 Adam steps on the negative log-likelihood of the
# training nodes only.
# NOTE(review): the loss is unweighted. The metrics reported further down
# (macro recall 0.5000 and a class with no predicted samples) suggest the
# model predicts a single class, so class weights may be worth trying.
model = GCN()
model.train()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(input=out[data.train_mask], target=data.y[data.train_mask])
    loss.backward()
    optimizer.step()

# Switch to evaluation mode (disables dropout in the forward pass).
model.eval()
# Inference only: turn autograd off so no graph is recorded for this pass.
with torch.no_grad():
    pred = model(data).argmax(dim=1)
# Accuracy = correct predictions on test nodes / number of test nodes.
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(np.array(data.test_mask).sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9707

# Predictions and ground truth restricted to the test nodes.
predicted_labels = pred[data.test_mask]
true_labels = data.y[data.test_mask]

# Macro-averaged scores (unweighted mean over the classes).
# zero_division=0 states explicitly what scikit-learn otherwise does while
# emitting an UndefinedMetricWarning: a class that is never predicted gets a
# score of 0. The reported numbers are unchanged.
precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='macro', zero_division=0)
f1 = f1_score(true_labels, predicted_labels, average='macro', zero_division=0)
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Precision: 0.4854
Recall: 0.5000
F1 Score: 0.4926

/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))